# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from news.items import WeiboItem
from urllib.parse import urljoin
from utils.weibo_cookie import get_cookies


class WeiboSpider(scrapy.Spider):
    """Scrape the Weibo picture/text channel through Splash-rendered pages.

    Each card in the ``PCD_pictext_i_v5`` list becomes one :class:`WeiboItem`
    carrying title, link, author info, engagement counters and image URLs
    that the ImagesPipeline downloads.
    """
    name = 'weibo'
    allowed_domains = ['weibo.com']
    start_urls = ['https://weibo.com/?category=0']
    # Scheme-only base: urljoin() against this turns protocol-relative
    # image src values (``//ww1.sinaimg.cn/...``) into http:// URLs.
    image_scheme = 'http://'
    user_base_url = "https://weibo.com/"

    custom_settings = {
        'ITEM_PIPELINES': {
            'news.pipelines.WeiboCheckEmptyPipeline': 10,
            'scrapy.pipelines.images.ImagesPipeline': 100,
            'news.pipelines.WeiboSavePipeline': 300,
        }
    }

    def start_requests(self):
        """Issue one Splash-rendered request per start URL, with login cookies.

        The tall viewport forces the lazy-loading page to render the full
        card list in a single pass; ``images: 0`` skips image downloads in
        Splash itself (the ImagesPipeline fetches them later).
        """
        for url in self.start_urls:
            yield SplashRequest(
                url,
                self.parse,
                args={'wait': 1, 'viewport': '1024x6480', 'timeout': 90,
                      'images': 0, 'resource_timeout': 10},
                cookies=get_cookies(),
            )

    def parse(self, response):
        """Yield one populated ``WeiboItem`` per card, in page order.

        Fix: a missing author avatar (``None``) used to be appended to the
        image list; ``urljoin('http://', None)`` returns the bare base
        ``'http://'``, so a bogus URL reached the ImagesPipeline.  Falsy
        entries are now filtered out before building ``image_urls``.
        """
        for sort, card in enumerate(
                response.xpath('//div[@id="PCD_pictext_i_v5"]/ul/div'), start=1):
            item = WeiboItem()
            item["title"] = ''.join(card.xpath('.//h3/node()').getall())

            # The card link lives on the row itself, or on a nested div,
            # depending on whether it is a picture or a video entry.
            item["link"] = (
                card.xpath('./@href').get()
                or card.xpath('./div[@class="vid"]/@href').get()
                or card.xpath('./div[@class="list_des"]/@href').get()
            )

            item["preview_images"] = card.xpath(
                './/div[contains(@class,"pic") or contains(@class, "vid")]//img/@src').getall()
            item["author"] = card.xpath(
                './/div[contains(@class, "subinfo_box")]/a[2]/span/text()').get()

            author_url = card.xpath(
                './/div[contains(@class, "subinfo_box")]/a[2]/@href').get()
            item["author_url"] = urljoin(self.user_base_url, author_url)

            item["author_avatar"] = card.xpath(
                './/span[contains(@class, "subinfo_face")]/img/@src').get()
            item["publish_time"] = card.xpath(
                './/div[contains(@class, "subinfo_box")]'
                '/span[contains(@class, "subinfo") and not(contains(@class, "subinfo_rgt"))]/text()').get()
            # Counters appear right-to-left: like, reply, share.
            item["share_count"] = card.xpath(
                './/span[contains(@class, "subinfo_rgt")][3]/em[last()]/text()').get()
            item["reply_count"] = card.xpath(
                './/span[contains(@class, "subinfo_rgt")][2]/em[last()]/text()').get()
            item["like_count"] = card.xpath(
                './/span[contains(@class, "subinfo_rgt")][1]/em[last()]/text()').get()
            item["sort"] = sort

            # Normalize protocol-relative src values to http:// and drop
            # None/empty entries (e.g. avatar not found) so no garbage URL
            # reaches the ImagesPipeline.
            images = item["preview_images"] + [item["author_avatar"]]
            item["image_urls"] = [urljoin(self.image_scheme, u) for u in images if u]

            # NOTE(review): ``crawler.now`` is a project-side attribute set
            # elsewhere (not standard Scrapy) — presumably the crawl timestamp.
            item["update_time"] = self.crawler.now

            yield item


